{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. FPB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dic = {\n",
    "    0:\"negative\",\n",
    "    1:'neutral',\n",
    "    2:'positive',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset financial_phrasebank (/xfs/home/tensor_zy/.cache/huggingface/datasets/financial_phrasebank/sentences_50agree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "99db1928ac754fb5bd0d6b11a28ca581",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'instruction'],\n",
       "    num_rows: 3634\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fpb_datasets = load_dataset(\"financial_phrasebank\", \"sentences_50agree\")\n",
    "fpb_datasets = fpb_datasets[\"train\"]\n",
    "fpb_datasets = fpb_datasets.to_pandas()\n",
    "fpb_datasets.columns = [\"input\", \"output\"]\n",
    "fpb_datasets[\"output\"] = fpb_datasets[\"output\"].apply(lambda x:dic[x])\n",
    "fpb_datasets[\"instruction\"]  = \"What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.\"\n",
    "fpb_datasets = datasets.Dataset.from_pandas(fpb_datasets)\n",
    "fpb_datasets = fpb_datasets.train_test_split(seed = 42)['train']\n",
    "fpb_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'instruction'],\n",
       "    num_rows: 21804\n",
       "})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset = datasets.concatenate_datasets([fpb_datasets]*6)   # we want each data source have similar number of samples\n",
    "train_dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. FiQA SA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_label(x):\n",
    "    if x < - 0.1: return \"negative\"\n",
    "    elif x >=-0.1 and x < 0.1: return \"neutral\"\n",
    "    elif x >= 0.1: return \"positive\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_instructions(x):\n",
    "    if x == \"post\":\n",
    "        return \"What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.\"\n",
    "    else:\n",
    "        return \"What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset csv (/xfs/home/tensor_zy/.cache/huggingface/datasets/pauri32___csv/pauri32--fiqa-2018-0ac3134eda915d9d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "095acc75464e4810b964bba6f4aa4b77",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'instruction'],\n",
       "    num_rows: 938\n",
       "})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = load_dataset('pauri32/fiqa-2018')\n",
    "dataset = datasets.concatenate_datasets([dataset[\"train\"], dataset[\"validation\"] ,dataset[\"test\"] ])\n",
    "dataset = dataset.to_pandas()\n",
    "dataset[\"output\"] = dataset.sentiment_score.apply(make_label)\n",
    "dataset[\"instruction\"] = dataset.format.apply(add_instructions)\n",
    "dataset = dataset[['sentence', 'output',\"instruction\"]]\n",
    "dataset.columns = [\"input\", \"output\",\"instruction\"]\n",
    "dataset = datasets.Dataset.from_pandas(dataset)\n",
    "dataset = dataset.train_test_split(0.226, seed = 42)['train']\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19698\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'instruction'],\n",
       "    num_rows: 41502\n",
       "})"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tmp_dataset = datasets.concatenate_datasets([dataset]*21)\n",
    "train_dataset = datasets.concatenate_datasets([train_dataset, tmp_dataset]) \n",
    "print(tmp_dataset.num_rows)\n",
    "train_dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. TFNS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "dic = {\n",
    "    0:\"negative\",\n",
    "    1:'positive',\n",
    "    2:'neutral',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset csv (/xfs/home/tensor_zy/.cache/huggingface/datasets/zeroshot___csv/zeroshot--twitter-financial-news-sentiment-467590c9f24f65ad/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "adc5be80faf94904b78caa033c142f02",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'instruction'],\n",
       "    num_rows: 9543\n",
       "})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "social_media_dataset = load_dataset('zeroshot/twitter-financial-news-sentiment')\n",
    "social_media_dataset = social_media_dataset['train']\n",
    "social_media_dataset = social_media_dataset.to_pandas()\n",
    "social_media_dataset['label'] = social_media_dataset['label'].apply(lambda x:dic[x])\n",
    "social_media_dataset['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'\n",
    "social_media_dataset.columns = ['input', 'output', 'instruction']\n",
    "social_media_dataset = datasets.Dataset.from_pandas(social_media_dataset)\n",
    "social_media_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19086\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'instruction'],\n",
       "    num_rows: 60588\n",
       "})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tmp_dataset = datasets.concatenate_datasets([social_media_dataset]*2)\n",
    "train_dataset = datasets.concatenate_datasets([train_dataset,tmp_dataset]) \n",
    "print(tmp_dataset.num_rows)\n",
    "train_dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. NWGI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset parquet (/xfs/home/tensor_zy/.cache/huggingface/datasets/oliverwang15___parquet/oliverwang15--news_with_gpt_instructions-ec641c48430028ee/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e24cb16cb385400cbdc53daf965e5abe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input', 'output', 'instruction'],\n",
       "    num_rows: 16184\n",
       "})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "finance_dataset = load_dataset('oliverwang15/news_with_gpt_instructions')\n",
    "finance_dataset = finance_dataset['train'].to_pandas()\n",
    "finance_dataset['output'] = finance_dataset['label']\n",
    "finance_dataset[\"input\"] = finance_dataset[\"news\"]\n",
    "finance_dataset[\"instruction\"] = 'What is the sentiment of this news? Please choose an answer from {strong negative/moderately negative/mildly negative/neutral/mildly positive/moderately positive/strong positive}, then provide some short reasons.'\n",
    "finance_dataset = finance_dataset[['input', 'output', 'instruction']]\n",
    "finance_dataset = datasets.Dataset.from_pandas(finance_dataset)\n",
    "finance_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(76772, 3)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset = datasets.concatenate_datasets([train_dataset, finance_dataset])\n",
    "all_dataset = train_dataset.shuffle(seed = 42)\n",
    "all_dataset.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from huggingface_hub import notebook_login\n",
    "# notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all_dataset.push_to_hub(\"fingpt_chatglm2_sentiment_instruction_lora_ft_dataset\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "76772"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "21804 + 19698 + 19086 + 16184"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Make Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from tqdm.notebook import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_example(example: dict) -> dict:\n",
    "    context = f\"Instruction: {example['instruction']}\\n\"\n",
    "    if example.get(\"input\"):\n",
    "        context += f\"Input: {example['input']}\\n\"\n",
    "    context += \"Answer: \"\n",
    "    target = example[\"output\"]\n",
    "    return {\"context\": context, \"target\": target}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_list = []\n",
    "for item in all_dataset.to_pandas().itertuples():\n",
    "    tmp = {}\n",
    "    tmp[\"instruction\"] = item.instruction\n",
    "    tmp[\"input\"] = item.input\n",
    "    tmp[\"output\"] = item.output\n",
    "    data_list.append(tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "90d3753764d94213ba2abd248365fc4c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "formatting..:   0%|          | 0/76772 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "with open(\"dataset_new.jsonl\", 'w') as f:\n",
    "    for example in tqdm(data_list, desc=\"formatting..\"):\n",
    "        f.write(json.dumps(format_example(example)) + '\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "import datasets\n",
    "from transformers import AutoTokenizer, AutoConfig\n",
    "\n",
    "model_name = \"THUDM/chatglm2-6b\"\n",
    "jsonl_path = \"dataset_new.jsonl\"\n",
    "save_path = 'dataset_new'\n",
    "max_seq_length = 512\n",
    "skip_overlength = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(tokenizer, config, example, max_seq_length):\n",
    "    prompt = example[\"context\"]\n",
    "    target = example[\"target\"]\n",
    "    prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)\n",
    "    target_ids = tokenizer.encode(\n",
    "        target,\n",
    "        max_length=max_seq_length,\n",
    "        truncation=True,\n",
    "        add_special_tokens=False)\n",
    "    input_ids = prompt_ids + target_ids + [config.eos_token_id]\n",
    "    return {\"input_ids\": input_ids, \"seq_len\": len(prompt_ids)}\n",
    "\n",
    "def read_jsonl(path, max_seq_length, skip_overlength=False):\n",
    "    tokenizer = AutoTokenizer.from_pretrained(\n",
    "        model_name, trust_remote_code=True)\n",
    "    config = AutoConfig.from_pretrained(\n",
    "        model_name, trust_remote_code=True, device_map='auto')\n",
    "    with open(path, \"r\") as f:\n",
    "        for line in tqdm(f.readlines()):\n",
    "            example = json.loads(line)\n",
    "            feature = preprocess(tokenizer, config, example, max_seq_length)\n",
    "            if skip_overlength and len(feature[\"input_ids\"]) > max_seq_length:\n",
    "                continue\n",
    "            feature[\"input_ids\"] = feature[\"input_ids\"][:max_seq_length]\n",
    "            yield feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset generator/default to /xfs/home/tensor_zy/.cache/huggingface/datasets/generator/default-1017a45f33968f36/0.0.0...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "383d652042034e5fafa8289d5e7547e9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4d4a8e3432764635ad80157df5a4ba3a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/76772 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset generator downloaded and prepared to /xfs/home/tensor_zy/.cache/huggingface/datasets/generator/default-1017a45f33968f36/0.0.0. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7ce4568c5e6b44a281cbc8aa3431ee63",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/76771 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset = datasets.Dataset.from_generator(\n",
    "    lambda: read_jsonl(jsonl_path, max_seq_length, skip_overlength)\n",
    "    )\n",
    "dataset.save_to_disk(save_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "finrl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
